suppressPackageStartupMessages({
library(tidyverse)
library(scran)
library(scater)
library(patchwork)
library(scDblFinder)
library(cowplot)
library(pheatmap)
library(edgeR)
library(ggrepel)
})

Loading Data

data.dir <- "Files_already_generated_with_Sevenbridges/D2/"

data <- read.csv(paste(data.dir, "Combined_CARNK_Sample2_RSEC_MolsPerCell.csv", sep = ""), skip = 7)
data$Cell_Index <- paste(data$Cell_Index, "_D2", sep = "")

The metadata contains Sample_Tag (nucleotide labeled antibody used to distinguish between samples), Sample_Name (patient id), multiplet status assigned by the SevenBridges pipeline

# donor 2
## table with cell information (sample barcode, donor, ...)
coldata <- read.csv(paste(data.dir, "CARNK_Sample2_Sample_Tag_Calls.csv", sep = ""), skip = 7) %>%
  dplyr::mutate(invalid = (Sample_Name == "Multiplet" | Sample_Name == "Undetermined")) %>%
  dplyr::mutate(donor = "D2")
coldata$Cell_Index <- paste(coldata$Cell_Index, "_D2", sep = "")

SevenBridges Output

SevenBridges pre-processing pipeline report:

sb <- data.frame(Donor = "D2",
                 Number.of.Cells = dim(data)[1],
                 Number.of.Features = dim(data[, -1])[2],
                 Median.Number.of.Features = median(rowSums(data > 0)),
                 stringsAsFactors = F)

DT::datatable(sb, 
              class = 'compact stripe hower',
              options = list(
                dom = "Bfrtip",
                scrollX = TRUE,
                paging = FALSE,
                searching = FALSE,
                info = FALSE,
                ordering = FALSE,
                columnDefs = list(list(className = 'dt-center', targets = 0:3))), rownames = FALSE)

Sample IDs

Number of cells in each sample:

table(coldata$Sample_Name)
## 
##   CARKOafter  CARKObefore     CARafter    CARbefore      KOafter     KObefore 
##         2740         2081         2866         2434         2719         2349 
##    Multiplet      NTafter     NTbefore Undetermined 
##         2766         2880         2359          184

We define a new column containing the group of samples:

  • BeforeCoc = Before
  • AfterCoc = After
  • NA = Multiplets + Undetermined
coldata <- coldata %>%
  dplyr::mutate(group = ifelse(grepl("before", Sample_Name), "BeforeCoc",
    ifelse(grepl("after", Sample_Name), "AfterCoc", NA)))

p1 <- coldata %>%
  ggplot(aes(x = donor, fill = group)) + 
  geom_bar() +
  labs(x = "Donor", fill = "Group")

p2 <- coldata %>%
  ggplot(aes(x = donor, fill = Sample_Name)) + 
  geom_bar() +
  labs(x = "Donor", fill = "Sample Name")

p3 <- coldata %>%
  ggplot(aes(x = group, fill = Sample_Name)) + 
  geom_bar() +
  labs(x = "Group", fill = "Sample Name")

p1 + p2 + p3

coldata %>%
  ggplot(aes(x = donor, fill = group, color = Sample_Name)) +
  geom_bar() +
  labs(x = "Donor", fill = "Group", color = "Sample Name")

Pre-processing Workflow

Filtering

data_t <- data[, -1] %>% t()
rownames(data_t) <- colnames(data[, -1])
colnames(data_t) <- data$Cell_Index

sce <- SingleCellExperiment(assays = list(counts = data_t), colData = coldata)
rowData(sce)$Type <- ifelse(grepl("abo", rownames(sce), ignore.case = TRUE), "Antibody Capture", "Gene Expression") # 'Type' is either gene or antibody
# removing the cells with multiplet or undetermined assignment
sce <- sce[, !coldata$invalid]

coldata <- coldata %>%
  dplyr::filter(!invalid)
# splitting protein from RNA
sce_split <- splitAltExps(sce, rowData(sce)$Type)
# altExpNames(sce_split)

Overall Quality Metrics

Before Filtering

# ADT vs RNA counts
rna_numi <- colSums(counts(sce_split))
adt_numi <- colSums(counts(altExp(sce_split)))
nUMIdf <- data.frame(rna_nUMI = rna_numi,
                     adt_nUMI = adt_numi)

ggplot(nUMIdf, aes(x = log10(adt_nUMI+1), y = log10(rna_nUMI+1))) + 
  geom_point(alpha = 0.1, size = 1) +
  geom_density_2d(color = "orange") +
  labs(x = "ADT Counts", y = "RNA Counts")

Computing QC metrics

outliers <- perCellQCMetrics(sce_split)
libsize.drop <- isOutlier(outliers$sum, log = TRUE, type = "both", nmads = 3)
feature.drop <- isOutlier(outliers$detected, log = TRUE, type = "lower", nmads = 3)
print(data.frame(ByLibSize = sum(libsize.drop),
                 ByFeature = sum(feature.drop),
                 Discard = sum(libsize.drop | feature.drop)))
##   ByLibSize ByFeature Discard
## 1       129       102     231
outliers.discard <- libsize.drop | feature.drop

#outliers.discard <- quickPerCellQC(outliers)
#colSums(as.matrix(outliers.discard))

Ab Counts

ab.discard <- isOutlier(outliers$`altexps_Antibody Capture_detected`,
                        log = TRUE,
                        type = "lower",
                        min_diff = 1)
summary(ab.discard)
##    Mode   FALSE 
## logical   20428
hist(outliers$`altexps_Antibody Capture_detected`, 
     col = 'grey',
     main = "", xlab = "Number of detected ADTs")

QC Plots

colData(sce_split) <- cbind(colData(sce_split), outliers) # adding the outliers info (sum, detected) to the sce object coldata
sce_split$discard <- outliers.discard | ab.discard # adding the cells to be discarded to the sce object coldata

plotColData(sce_split, x = "detected", y = "sum", colour_by = "discard") +
  theme(panel.border = element_rect(color = "grey")) + 
  labs(x = "Number of Features Detected", y = "Library Size")

gridExtra::grid.arrange(
  plotColData(sce_split, y="sum", colour_by="discard") +
    scale_y_log10() + ggtitle("Library Size"),
  plotColData(sce_split, y="detected", colour_by="discard") +
    scale_y_log10() + ggtitle("Genes Detected") + labs(y = "genes detected"),
  plotColData(sce_split, y="altexps_Antibody Capture_detected", colour_by="discard") +
    scale_y_log10() + ggtitle("Antibodies Detected") + labs(y = "antibodies detected"),
  ncol=1
)

# ADT vs RNA counts
rna_numi <- colSums(counts(sce_split))
adt_numi <- colSums(counts(altExp(sce_split)))
nUMIdf <- data.frame(rna_nUMI = rna_numi,
                     adt_nUMI = adt_numi, 
                     Discard = sce_split$discard)

ggplot(nUMIdf, aes(x = log10(adt_nUMI+1), y = log10(rna_nUMI+1), color = Discard)) + 
  geom_point(alpha = 0.1, size = 1) +
  geom_density_2d(color = "black") +
  labs(x = "ADT Counts", y = "RNA Counts") +
  scale_color_manual(values = c("#0072B2", "#C4961A"))

# filtering; removing outliers
sce_split <- sce_split[, !sce_split$discard]
sce_split
## class: SingleCellExperiment 
## dim: 431 20197 
## metadata(0):
## assays(1): counts
## rownames(431): ADA ADGRE1 ... ZBTB16 ZNF683
## rowData names(2): Type scDblFinder.selected
## colnames(20197): 697359_D2 864267_D2 ... 314351_D2 855920_D2
## colData names(17): Cell_Index Sample_Tag ... total discard
## reducedDimNames(0):
## mainExpName: Gene Expression
## altExpNames(1): Antibody Capture
table(coldata$Sample_Name)
## 
##  CARKOafter CARKObefore    CARafter   CARbefore     KOafter    KObefore 
##        2740        2081        2866        2434        2719        2349 
##     NTafter    NTbefore 
##        2880        2359

Normalization

# RNA

## size factors
lib.sf <- librarySizeFactors(sce_split)
summary(lib.sf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2150  0.5897  0.8379  1.0000  1.2181  4.2694
lib.sf.df <- data.frame(lib.sf, Group = colData(sce_split)$group)

## plotting size factors
hist(log10(lib.sf), xlab="Log10[Size Factor]", col='grey80')

ggplot(lib.sf.df, aes(log10(lib.sf))) +  
  geom_histogram() + 
  facet_wrap(~Group, scales = "free") + 
  labs(x = "Log10[Size Factor]")

## normalization
sce_split <- logNormCounts(sce_split)
#assayNames(sce_split)

# ADT

clr_norm <- function(x) {
  return(log1p(x = x / (exp(x = sum(log1p(x = x[x > 0]), na.rm = TRUE) / length(x = x)))))
}

logcounts(altExp(sce_split)) <- apply(counts(altExp(sce_split)), 2, clr_norm)

Variance Modeling

# determining variance components
dec <- modelGeneVar(sce_split)
dec[order(dec$bio, decreasing = TRUE), ]
## DataFrame with 431 rows and 6 columns
##             mean     total      tech       bio     p.value         FDR
##        <numeric> <numeric> <numeric> <numeric>   <numeric>   <numeric>
## CCL4     4.51591   3.75062  0.850912   2.89971 2.79404e-33 1.03100e-30
## IL32     4.20734   3.26054  0.773740   2.48680 8.02914e-30 1.48138e-27
## CCL3     3.24819   2.92965  1.030847   1.89880 5.03180e-11 4.64184e-09
## FCGR3A   2.84808   2.97694  1.126733   1.85020 4.09905e-09 3.02510e-07
## CCL5     3.89221   2.44580  0.785547   1.66025 5.89577e-14 7.25180e-12
## ...          ...       ...       ...       ...         ...         ...
## PTPRC    2.20790  0.787051  1.192988 -0.405937    0.883852    0.944507
## TRBC2    2.59636  0.724354  1.159376 -0.435022    0.906107    0.952574
## LAMP1    2.08797  0.756541  1.203842 -0.447301    0.903936    0.952574
## HLA.A    4.83744  0.338797  0.953669 -0.614872    0.988191    0.990509
## HLA.C    5.64365  0.362781  1.005275 -0.642494    0.987569    0.990509
fit <- metadata(dec)
plot(fit$mean, fit$var, 
     xlab = "Mean of log-expression", ylab = "Variance of log-expression")
points(fit$mean, fit$var, col = "red", pch = 16)
curve(fit$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)

Distribution of ADT Markers

Looking at the distribution of ADT markers (after normalization):

p <- do.call(plot_grid, c(lapply(rownames(logcounts(altExp(sce_split))), function(adt_name){
  ggplot(as.data.frame(logcounts(altExp(sce_split))[adt_name, ]), aes_string(x=logcounts(altExp(sce_split))[adt_name, ])) +
    geom_histogram(color="black",
                   fill = "black",
                   breaks=seq(0, 4.5, by=0.10)) + 
    xlab(adt_name) + 
    theme(axis.title.x = element_text(size = 8, face="bold"))}), ncol = 4))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## i Please use tidy evaluation ideoms with `aes()`
print(p)

We define a new column containing the condition of samples:

  • NT = NTbefore + NTafter
  • KO = KObefore + KOafter
  • CAR = CARbefore + CARafter
  • CARKO = CARKObefore + CARKOafter
# creating new colData
sce_split$CAR <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore", "CARafter", "CARbefore"), "Yes", "No")
sce_split$KO <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore", "KOafter", "KObefore"), "Yes", "No")
sce_split$condition <- ifelse(sce_split$Sample_Name %in% c("CARKOafter", "CARKObefore"), "CARKO",
                                 ifelse(sce_split$Sample_Name %in% c("CARafter", "CARbefore"), "CAR",
                                        ifelse(sce_split$Sample_Name %in% c("KOafter", "KObefore"), "KO", "NT")))
sce_split$condition <- factor(sce_split$condition, levels = c("NT", "KO", "CAR", "CARKO"))
saveRDS(sce_split, "intermediate_files/D2/sce_donor2.Rds")

Session

date()
## [1] "Thu Jan 26 11:21:13 2023"
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=C                 LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggrepel_0.9.2               edgeR_3.38.4               
##  [3] limma_3.52.4                pheatmap_1.0.12            
##  [5] cowplot_1.1.1               scDblFinder_1.10.0         
##  [7] patchwork_1.1.2             scater_1.24.0              
##  [9] scran_1.24.1                scuttle_1.6.3              
## [11] SingleCellExperiment_1.18.1 SummarizedExperiment_1.26.1
## [13] Biobase_2.56.0              GenomicRanges_1.48.0       
## [15] GenomeInfoDb_1.32.4         IRanges_2.30.1             
## [17] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [19] MatrixGenerics_1.8.1        matrixStats_0.63.0         
## [21] forcats_0.5.2               stringr_1.5.0              
## [23] dplyr_1.0.10                purrr_1.0.1                
## [25] readr_2.1.3                 tidyr_1.3.0                
## [27] tibble_3.1.8                ggplot2_3.4.0              
## [29] tidyverse_1.3.2            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1              backports_1.4.1          
##   [3] igraph_1.3.5              BiocParallel_1.30.4      
##   [5] crosstalk_1.2.0           digest_0.6.31            
##   [7] htmltools_0.5.4           viridis_0.6.2            
##   [9] fansi_1.0.4               magrittr_2.0.3           
##  [11] ScaledMatrix_1.4.1        googlesheets4_1.0.1      
##  [13] cluster_2.1.4             tzdb_0.3.0               
##  [15] Biostrings_2.64.1         modelr_0.1.10            
##  [17] timechange_0.2.0          colorspace_2.1-0         
##  [19] rvest_1.0.3               haven_2.5.1              
##  [21] xfun_0.36                 crayon_1.5.2             
##  [23] RCurl_1.98-1.9            jsonlite_1.8.4           
##  [25] glue_1.6.2                gtable_0.3.1             
##  [27] gargle_1.2.1              zlibbioc_1.42.0          
##  [29] XVector_0.36.0            DelayedArray_0.22.0      
##  [31] BiocSingular_1.12.0       scales_1.2.1             
##  [33] DBI_1.1.3                 Rcpp_1.0.10              
##  [35] isoband_0.2.7             viridisLite_0.4.1        
##  [37] dqrng_0.3.0               rsvd_1.0.5               
##  [39] DT_0.27                   metapod_1.4.0            
##  [41] htmlwidgets_1.6.1         httr_1.4.4               
##  [43] RColorBrewer_1.1-3        ellipsis_0.3.2           
##  [45] pkgconfig_2.0.3           XML_3.99-0.13            
##  [47] farver_2.1.1              sass_0.4.5               
##  [49] dbplyr_2.3.0              locfit_1.5-9.7           
##  [51] utf8_1.2.2                tidyselect_1.2.0         
##  [53] labeling_0.4.2            rlang_1.0.6              
##  [55] munsell_0.5.0             cellranger_1.1.0         
##  [57] tools_4.2.2               cachem_1.0.6             
##  [59] xgboost_1.7.3.1           cli_3.6.0                
##  [61] generics_0.1.3            broom_1.0.2              
##  [63] evaluate_0.20             fastmap_1.1.0            
##  [65] yaml_2.3.7                knitr_1.41               
##  [67] fs_1.6.0                  sparseMatrixStats_1.8.0  
##  [69] xml2_1.3.3                compiler_4.2.2           
##  [71] rstudioapi_0.14           beeswarm_0.4.0           
##  [73] reprex_2.0.2              statmod_1.5.0            
##  [75] bslib_0.4.2               stringi_1.7.12           
##  [77] highr_0.10                lattice_0.20-45          
##  [79] bluster_1.6.0             Matrix_1.5-3             
##  [81] vctrs_0.5.2               pillar_1.8.1             
##  [83] lifecycle_1.0.3           jquerylib_0.1.4          
##  [85] BiocNeighbors_1.14.0      data.table_1.14.6        
##  [87] bitops_1.0-7              irlba_2.3.5.1            
##  [89] rtracklayer_1.56.1        R6_2.5.1                 
##  [91] BiocIO_1.6.0              gridExtra_2.3            
##  [93] vipor_0.4.5               codetools_0.2-18         
##  [95] MASS_7.3-58.1             assertthat_0.2.1         
##  [97] rjson_0.2.21              withr_2.5.0              
##  [99] GenomicAlignments_1.32.1  Rsamtools_2.12.0         
## [101] GenomeInfoDbData_1.2.8    parallel_4.2.2           
## [103] hms_1.1.2                 grid_4.2.2               
## [105] beachmat_2.12.0           rmarkdown_2.20           
## [107] DelayedMatrixStats_1.18.2 googledrive_2.0.0        
## [109] lubridate_1.9.1           ggbeeswarm_0.7.1         
## [111] restfulr_0.0.15
knitr::knit_exit()